Results

Random Forest SSL

# Load required libraries
library(randomForest)
library(caret)
library(caTools)
library(ggplot2)

# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")

# ----------------------------------------
# Data Preparation
# ----------------------------------------

# Load training dataset (one row per specimen, including an 'ecotype' column)
data <- read.csv("TrainingData.csv", sep = ",", header = TRUE)

# randomForest() needs a factor response for classification, so convert
# the categorical variable (ecotype) explicitly.
data$ecotype <- as.factor(data$ecotype)

# All rows are used for training; the held-out test set is loaded separately.
train <- data

# ----------------------------------------
# Train the RF model
# ----------------------------------------

# Replace x with the best mtry and xxxxx with the best ntree for your data.
# NOTE(review): 'tuneLength', 'metric', and 'trControl' are arguments of
# caret::train(), not randomForest(), and 'ctrl' was never defined, so the
# original call could not run. Do the tuning with caret::train() beforehand
# and pass only the chosen values here.
Model_Tunned <- randomForest(
  ecotype ~ ., data = train,
  mtry = x,               # best mtry found during tuning
  ntree = xxxxx,          # best ntree found during tuning
  importance = TRUE,      # keep permutation importance for the plots below
  strata = train$ecotype  # stratified bootstrap sampling by class
)

# Display variable importance
importance(Model_Tunned)
varImpPlot(Model_Tunned)

# ----------------------------------------
# Test the RF Model
# ----------------------------------------

# Load test dataset (same columns as the training data)
dataTest <- read.csv("TestingData.csv", sep = ",", header = TRUE)

# Align the factor levels with the training data so predictions and the
# confusion matrix line up class-by-class.
dataTest$ecotype <- factor(dataTest$ecotype, levels = levels(train$ecotype))

# Make predictions on the held-out specimens
pred <- predict(Model_Tunned, newdata = dataTest)

# Overall accuracy: proportion of correctly classified specimens
sum(pred == dataTest$ecotype) / nrow(dataTest)

# Create Confusion Matrix. Reference the 'ecotype' column by name rather
# than by a positional placeholder, which is error-prone.
cm <- table(dataTest$ecotype, pred)
conf_matrix_df <- confusionMatrix(cm, positive = NULL, prevalence = NULL)
conf_matrix_df

HCA SSL

# Load required libraries
library(factoextra)
library(cluster)
library(ggdendro)
library(ggplot2)
library(ape)
library(dendextend)

# ----------------------------------------
# Data Preparation
# ----------------------------------------

# Load dataset (Ensure the CSV file is in the working directory)
data <- read.csv("PCoordBentSym.csv", sep = ",", header = TRUE)

# Convert your categorical variable to a factor (replace 'YourGroup' with
# the grouping column of your own data).
data$YourGroup <- as.factor(data$YourGroup)

# ----------------------------------------
# Data Scaling
# ----------------------------------------

# Standardize only the numeric columns: scale() errors on factor and
# character columns (e.g. YourGroup, LabID), so they must be excluded.
data.scaled <- scale(data[vapply(data, is.numeric, logical(1))])

# Compute Euclidean distance matrix
d <- dist(data.scaled, method = "euclidean")

# ----------------------------------------
# Optimal Cluster Number Selection (Gap Statistic)
# ----------------------------------------

# NOTE(review): the original called clusGap() on 'scaled_data', which was
# never defined; the scaled matrix is named 'data.scaled'.
gap_stat <- clusGap(data.scaled, FUN = hcut, nstart = 25, K.max = 20, B = 200)

# Visualize the gap statistic
fviz_gap_stat(gap_stat)

# ----------------------------------------
# Hierarchical Clustering
# ----------------------------------------

# Compute hierarchical clustering using Ward's method on the Euclidean
# distance matrix 'd'
final_clust <- hclust(d, method = "ward.D2")

# Cut tree into predefined clusters (replace x with the optimal number of
# clusters suggested by the gap statistic above)
groups <- cutree(final_clust, k = x)

# ----------------------------------------
# Save Cluster Results
# ----------------------------------------

# Original group labels and specimen IDs from the input data
YourGroup <- data$YourGroup
LabID <- data$LabID

# Combine LabID, assigned cluster groups, and the original grouping.
# NOTE(review): the original bound an undefined 'CLUSTER' object; the
# cluster assignments produced by cutree() are stored in 'groups'.
results <- cbind(LabID, CLUSTER = groups, YourGroup)

# Convert into a data frame
results1 <- as.data.frame(results)
colnames(results1) <- c("LabID", "CLUSTER", "YourGroup")

# Export results to CSV. write.csv() guarantees a comma separator;
# write.table() defaults to spaces, which contradicts the .csv extension.
write.csv(results1, file = "PredictedClusters.csv", row.names = FALSE)

# ----------------------------------------
# Dendrogram Visualization
# ----------------------------------------

# Label each leaf of the tree with its group from the input data
final_clust$labels <- data$YourGroup

# Build the dendrogram plot (replace x with the chosen number of clusters k)
hca_dendrogram <- fviz_dend(
  final_clust,
  k = x,
  rect = TRUE,
  cex = 0.5,
  type = "rectangle",
  main = "YourName",
  xlab = "YourGroup",
  ylab = "Distance",
  sub = "",
  k_colors = "simpsons",
  color_labels_by_k = FALSE,
  ggtheme = theme_minimal()
)

# Display the dendrogram
hca_dendrogram

# Save the dendrogram as an image
ggsave("dendrogram.png", plot = hca_dendrogram)
[Figure placeholder: plots produced by the script above — image not captured in text extraction]

2B-PLS

# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")

# Load required libraries
library(readxl)    # read_excel() for the .xlsx inputs
library(geomorph)  # two.b.pls() used in the analysis below

# ----------------------------------------
# Load Data
# ----------------------------------------

# Load the shape data (symmetry-adjusted)
# load() restores the object 'YourName_sym.dt' from the binary file; its
# 'symm.sh' component holds the symmetric shape coordinates — presumably
# saved from an earlier symmetry/GPA step; confirm against that script.

load("YourName_sym.dt.bin")
Shape <- YourName_sym.dt$symm.sh

# Load environmental and Polygon data
# NOTE(review): update both paths to match your own directory layout.
Polygon <- read_excel("path/to/polygon.xlsx") 
Env_Variable <- read_excel("path/to/Environmental_Var.xlsx")

# ----------------------------------------
# Data Preparation
# ----------------------------------------

# Shape block (X) and environmental block (Y) of the two-block PLS
X1 <- Shape
# Convert environmental variables to numeric
Y1 <- as.data.frame(lapply(Env_Variable, as.numeric))

# ----------------------------------------
# Data Scaling
# ----------------------------------------

# Standardize the environmental block (zero mean, unit variance)
scaled_Y1 <- scale(Y1)

# ----------------------------------------
# PLS Analysis
# ----------------------------------------

# Two-block partial least squares between shape and environment,
# with 999 permutations for significance
PLS1 <- two.b.pls(X1, scaled_Y1, iter = 999, seed = NULL, print.progress = TRUE)

# Display summary statistics (r-PLS effect size and permutation p-value)
summary(PLS1)

# Cumulative variance explained by the PLS axes.
# NOTE(review): explvar() belongs to the 'pls' package (not loaded here) and
# does not apply to geomorph::two.b.pls objects; the proportion of squared
# singular values of the cross-block covariance SVD is used instead —
# confirm this matches the statistic you intend to report.
cumsum(PLS1$svd$d^2 / sum(PLS1$svd$d^2))

# ----------------------------------------
# PLS Scatter Plot
# ----------------------------------------

# Define color palette (one colour per group level)
cols <- c("YourColour1", "YourColour2", "YourColour3", "etc")

# Group membership of each specimen, used to index the palette
group_factor <- as.factor(YourGroup$YourGroup)

# Generate PLS plot coloured by group
P <- plot(PLS1, col = cols[group_factor], pch = 19, cex = 1.5)

# Add legend
legend("topleft",
       legend = unique(group_factor),
       col = unique(cols[group_factor]),
       pch = 19, pt.cex = 2, cex = 1)

# ----------------------------------------
# PLS Loadings Barplot
# ----------------------------------------

# Loadings of the first PLS axis for the environmental block,
# ordered by absolute magnitude
loadings_X <- PLS1$right.pls.vectors[, 1]
ord <- order(abs(loadings_X), decreasing = TRUE)
sorted_loadings <- loadings_X[ord]

# Environmental variable names in the same order as the loadings
sorted_env_variables <- colnames(Env_Variable)[ord]

# Generate barplot of the sorted loadings
barplot(sorted_loadings,
        beside = TRUE, horiz = TRUE, las = 2,
        names.arg = sorted_env_variables,
        main = "YourName", xlab = "Shape", ylab = "Environment",
        col = "magenta", border = "white", space = 0.2)
[Figure placeholder: plots produced by the script above — image not captured in text extraction]

RDA

# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")

# Load required libraries
library(vegan)
library(geomorph)
library(readxl)
library(ggvegan)
library(ggplot2)
library(vegan3d)

# ----------------------------------------
# Load Data
# ----------------------------------------

# Load shape data (symmetry-adjusted coordinates saved from an earlier step)
load("YourName_sym.dt.bin")  
Shape <- YourName_sym.dt$symm.sh  

# Convert the landmark array to a 2D (specimen x coordinate) matrix for rda()
ShapesData <- two.d.array(Shape, sep = ".")

# Load environmental and regional data
Polygon <- read_excel("path/to/polygon.xlsx")  
Env_Variable <- read_excel("path/to/Environmental_Var.xlsx")  

# ----------------------------------------
# Data Preparation
# ----------------------------------------

# Ecotype labels used to colour the ordination plots below.
# NOTE(review): the original read 'Ecotype <- Ecotype', which references an
# undefined object; the labels presumably live in the regional (Polygon)
# spreadsheet — confirm the column name against your data.
Ecotype <- Polygon$Ecotype

# Convert environmental variables to numeric - replace 1:17 with the
# predictor columns of your own spreadsheet
Env_variable_Numeric <- sapply(Env_Variable[1:17], as.numeric)

# ----------------------------------------
# Data Scaling
# ----------------------------------------

# Standardize each environmental predictor and store as a data frame,
# which is the form rda() expects for its constraint table (scale()
# always returns a matrix, so the conversion is unconditional).
Scaled_Env_Variable <- as.data.frame(scale(Env_variable_Numeric))

# ----------------------------------------
# RDA Analysis
# ----------------------------------------

# Redundancy analysis of shape constrained by all environmental predictors
rda_result <- rda(ShapesData ~ ., data = Scaled_Env_Variable)

# Display summary statistics
summary(rda_result)

# ----------------------------------------
# Significance Testing
# ----------------------------------------

# Permutation-based ANOVA at several levels of the model
Anova.RDA.Overall <- anova.cca(rda_result)                # whole-model significance
Anova.RDA.terms  <- anova.cca(rda_result, by = "terms")   # sequential term tests
Anova.RDA.margin <- anova.cca(rda_result, by = "margin")  # marginal term tests
Anova.RDA.onedf  <- anova.cca(rda_result, by = "onedf")   # one-df contrast tests
Anova.RDA.axis   <- anova.cca(rda_result, by = "axis")    # canonical axis tests

# ----------------------------------------
# Variance Explained by Each Environmental Variable
# ----------------------------------------

# Per-variable constrained variances - replace these values with the ones
# reported for your own RDA model
variances <- c(
  SalinityMean = 0.00005464, SalinityRange = 0.00006634, SilicateMean = 0.00008201,
  TemperatureMean = 0.00005589, Aspect = 0.00002771, MLDepthMean = 0.00002539,
  NitrateMean = 0.00000366, PhMean = 0.00001173, Slope = 0.00000604,
  ChlorophyllMean = 0.00002295, DissolvedO2Mean = 0.0000164, DissolvedO2Range = 0.00000993,
  CurrentDirectionMean = 0.00001241, CurrentDirectionRange = 0.00004396,
  CurrentVelocityMean = 0.00000776, BathymetryMean = 0.00001218, TopographicPosition = 0.00000297)

# Total constrained variance across all predictors
total_variance <- sum(variances)

# Express each variable's contribution as a percentage of the total
percent_variance <- prop.table(variances) * 100

# Display the results
percent_variance

# ----------------------------------------
# RDA Biplots
# ----------------------------------------

# Standard 2D biplot (ggvegan)
autoplot(rda_result, arrows = TRUE, geom = "text", legend = "none")

# Colour palette for the ecotypes (one colour per level)
cols <- c("YourColour1","YourColour2","YourColour3", "etc")

# NOTE(review): 'Ecotype' is a vector of labels, so index the palette with
# as.factor(Ecotype); the original 'Ecotype$Ecotype' applies '$' to a
# non-list object and errors.
ecotype_factor <- as.factor(Ecotype)

# Interactive 3D biplot using vegan3d (opens an rgl device)
ordirgl(rda_result, display = "site", cex = 0.5, choices = 1:3, 
        ax.col = "red", arr.len = 0.05, arr.col = "blue", pch = 16, 
        col = cols[ecotype_factor])

# Alternative static 3D plot
ordiplot3d(rda_result, display = "site", choices = 1:3, ax.col = "black", 
           arr.len = 0.1, arr.col = "green", col = cols[ecotype_factor], pch = 20)

# ----------------------------------------
# Statistical Tests on Covariates
# ----------------------------------------

# Permutation test for axis significance (by = "axis" gives one test per
# canonical axis; the plain call tests only the whole model)
permutation_test <- anova.cca(rda_result, by = "axis")

# Extract p-values for each axis.
# NOTE(review): anova.cca() returns an anova-style data frame whose
# p-values sit in the 'Pr(>F)' column; the original '$CCA$pvals' slot
# does not exist and silently yields NULL.
axis_pvalues <- permutation_test[["Pr(>F)"]]

# Permutation test p-values for individual variable significance
variable_pvalues <- anova.cca(rda_result, by = "terms")[["Pr(>F)"]]

# ----------------------------------------
# Correlation Between Environmental Variables
# ----------------------------------------

# Pearson correlations among predictors. Use the numeric matrix:
# cor() errors on non-numeric spreadsheet columns.
Correlation <- cor(Env_variable_Numeric)

# Save correlation matrix (tab-separated)
write.table(Correlation, file = "Env_Variable_Correlation.tsv", sep = "\t")

# Identify variable pairs with high correlation (>= 0.6, excluding the
# diagonal self-correlations of exactly 1)
high_correlation_variables <- which(Correlation >= 0.6 & Correlation < 1, arr.ind = TRUE)

# Display results
print(high_correlation_variables)
[Figure placeholder: plots produced by the script above — image not captured in text extraction]